import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data=pd.read_csv('Vehicle_Insurance (3).csv')
data
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 44 | 1 | 28.0 | 0 | > 2 Years | Yes | 40454.0 | 26.0 | 217 | 1 |
| 1 | 2 | Male | 76 | 1 | 3.0 | 0 | 1-2 Year | No | 33536.0 | 26.0 | 183 | 0 |
| 2 | 3 | Male | 47 | 1 | 28.0 | 0 | > 2 Years | Yes | 38294.0 | 26.0 | 27 | 1 |
| 3 | 4 | Male | 21 | 1 | 11.0 | 1 | < 1 Year | No | 28619.0 | 152.0 | 203 | 0 |
| 4 | 5 | Female | 29 | 1 | 41.0 | 1 | < 1 Year | No | 27496.0 | 152.0 | 39 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 381104 | 381105 | Male | 74 | 1 | 26.0 | 1 | 1-2 Year | No | 30170.0 | 26.0 | 88 | 0 |
| 381105 | 381106 | Male | 30 | 1 | 37.0 | 1 | < 1 Year | No | 40016.0 | 152.0 | 131 | 0 |
| 381106 | 381107 | Male | 21 | 1 | 30.0 | 1 | < 1 Year | No | 35118.0 | 160.0 | 161 | 0 |
| 381107 | 381108 | Female | 68 | 1 | 14.0 | 0 | > 2 Years | Yes | 44617.0 | 124.0 | 74 | 0 |
| 381108 | 381109 | Male | 46 | 1 | 29.0 | 0 | 1-2 Year | No | 41777.0 | 26.0 | 237 | 0 |
381109 rows × 12 columns
data.shape
(381109, 12)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 381109 entries, 0 to 381108 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 381109 non-null int64 1 Gender 381109 non-null object 2 Age 381109 non-null int64 3 Driving_License 381109 non-null int64 4 Region_Code 381109 non-null float64 5 Previously_Insured 381109 non-null int64 6 Vehicle_Age 381109 non-null object 7 Vehicle_Damage 381109 non-null object 8 Annual_Premium 381109 non-null float64 9 Policy_Sales_Channel 381109 non-null float64 10 Vintage 381109 non-null int64 11 Response 381109 non-null int64 dtypes: float64(3), int64(6), object(3) memory usage: 34.9+ MB
data.count()
id 381109 Gender 381109 Age 381109 Driving_License 381109 Region_Code 381109 Previously_Insured 381109 Vehicle_Age 381109 Vehicle_Damage 381109 Annual_Premium 381109 Policy_Sales_Channel 381109 Vintage 381109 Response 381109 dtype: int64
data.head(10)
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 44 | 1 | 28.0 | 0 | > 2 Years | Yes | 40454.0 | 26.0 | 217 | 1 |
| 1 | 2 | Male | 76 | 1 | 3.0 | 0 | 1-2 Year | No | 33536.0 | 26.0 | 183 | 0 |
| 2 | 3 | Male | 47 | 1 | 28.0 | 0 | > 2 Years | Yes | 38294.0 | 26.0 | 27 | 1 |
| 3 | 4 | Male | 21 | 1 | 11.0 | 1 | < 1 Year | No | 28619.0 | 152.0 | 203 | 0 |
| 4 | 5 | Female | 29 | 1 | 41.0 | 1 | < 1 Year | No | 27496.0 | 152.0 | 39 | 0 |
| 5 | 6 | Female | 24 | 1 | 33.0 | 0 | < 1 Year | Yes | 2630.0 | 160.0 | 176 | 0 |
| 6 | 7 | Male | 23 | 1 | 11.0 | 0 | < 1 Year | Yes | 23367.0 | 152.0 | 249 | 0 |
| 7 | 8 | Female | 56 | 1 | 28.0 | 0 | 1-2 Year | Yes | 32031.0 | 26.0 | 72 | 1 |
| 8 | 9 | Female | 24 | 1 | 3.0 | 1 | < 1 Year | No | 27619.0 | 152.0 | 28 | 0 |
| 9 | 10 | Female | 32 | 1 | 6.0 | 1 | < 1 Year | No | 28771.0 | 152.0 | 80 | 0 |
data.tail(10)
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 381099 | 381100 | Female | 51 | 1 | 28.0 | 0 | 1-2 Year | Yes | 44504.0 | 26.0 | 71 | 0 |
| 381100 | 381101 | Female | 29 | 1 | 28.0 | 0 | < 1 Year | Yes | 49007.0 | 124.0 | 137 | 0 |
| 381101 | 381102 | Female | 70 | 1 | 28.0 | 0 | > 2 Years | Yes | 50904.0 | 122.0 | 215 | 0 |
| 381102 | 381103 | Female | 25 | 1 | 41.0 | 1 | < 1 Year | Yes | 2630.0 | 152.0 | 102 | 0 |
| 381103 | 381104 | Male | 47 | 1 | 50.0 | 0 | 1-2 Year | Yes | 39831.0 | 26.0 | 235 | 0 |
| 381104 | 381105 | Male | 74 | 1 | 26.0 | 1 | 1-2 Year | No | 30170.0 | 26.0 | 88 | 0 |
| 381105 | 381106 | Male | 30 | 1 | 37.0 | 1 | < 1 Year | No | 40016.0 | 152.0 | 131 | 0 |
| 381106 | 381107 | Male | 21 | 1 | 30.0 | 1 | < 1 Year | No | 35118.0 | 160.0 | 161 | 0 |
| 381107 | 381108 | Female | 68 | 1 | 14.0 | 0 | > 2 Years | Yes | 44617.0 | 124.0 | 74 | 0 |
| 381108 | 381109 | Male | 46 | 1 | 29.0 | 0 | 1-2 Year | No | 41777.0 | 26.0 | 237 | 0 |
data.columns
Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
'Policy_Sales_Channel', 'Vintage', 'Response'],
dtype='object')
data.dtypes
id int64 Gender object Age int64 Driving_License int64 Region_Code float64 Previously_Insured int64 Vehicle_Age object Vehicle_Damage object Annual_Premium float64 Policy_Sales_Channel float64 Vintage int64 Response int64 dtype: object
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
categorical_columns
['Gender', 'Vehicle_Age', 'Vehicle_Damage']
numerical_columns = data.select_dtypes(include=['number']).columns.tolist()
numerical_columns
['id', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response']
unique_values = {col: data[col].unique() for col in data.columns}
unique_values
{'id': array([ 1, 2, 3, ..., 381107, 381108, 381109]),
'Gender': array(['Male', 'Female'], dtype=object),
'Age': array([44, 76, 47, 21, 29, 24, 23, 56, 32, 41, 71, 37, 25, 42, 60, 65, 49,
34, 51, 26, 57, 79, 48, 45, 72, 30, 54, 27, 38, 22, 78, 20, 39, 62,
58, 59, 63, 50, 67, 77, 28, 69, 52, 31, 33, 43, 36, 53, 70, 46, 55,
40, 61, 75, 64, 35, 66, 68, 74, 73, 84, 83, 81, 80, 82, 85]),
'Driving_License': array([1, 0]),
'Region_Code': array([28., 3., 11., 41., 33., 6., 35., 50., 15., 45., 8., 36., 30.,
26., 16., 47., 48., 19., 39., 23., 37., 5., 17., 2., 7., 29.,
46., 27., 25., 13., 18., 20., 49., 22., 44., 0., 9., 31., 12.,
34., 21., 10., 14., 38., 24., 40., 43., 32., 4., 51., 42., 1.,
52.]),
'Previously_Insured': array([0, 1]),
'Vehicle_Age': array(['> 2 Years', '1-2 Year', '< 1 Year'], dtype=object),
'Vehicle_Damage': array(['Yes', 'No'], dtype=object),
'Annual_Premium': array([ 40454., 33536., 38294., ..., 20706., 101664., 69845.]),
'Policy_Sales_Channel': array([ 26., 152., 160., 124., 14., 13., 30., 156., 163., 157., 122.,
19., 22., 15., 154., 16., 52., 155., 11., 151., 125., 25.,
61., 1., 86., 31., 150., 23., 60., 21., 121., 3., 139.,
12., 29., 55., 7., 47., 127., 153., 78., 158., 89., 32.,
8., 10., 120., 65., 4., 42., 83., 136., 24., 18., 56.,
48., 106., 54., 93., 116., 91., 45., 9., 145., 147., 44.,
109., 37., 140., 107., 128., 131., 114., 118., 159., 119., 105.,
135., 62., 138., 129., 88., 92., 111., 113., 73., 36., 28.,
35., 59., 53., 148., 133., 108., 64., 39., 94., 132., 46.,
81., 103., 90., 51., 27., 146., 63., 96., 40., 66., 100.,
95., 123., 98., 75., 69., 130., 134., 49., 97., 38., 17.,
110., 80., 71., 117., 58., 20., 76., 104., 87., 84., 137.,
126., 68., 67., 101., 115., 57., 82., 79., 112., 99., 70.,
2., 34., 33., 74., 102., 149., 43., 6., 50., 144., 143.,
41.]),
'Vintage': array([217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, 289, 221,
15, 58, 147, 256, 299, 158, 102, 116, 177, 232, 60, 180, 49,
57, 223, 136, 222, 149, 169, 88, 253, 107, 264, 233, 45, 184,
251, 153, 186, 71, 34, 83, 12, 246, 141, 216, 130, 282, 73,
171, 283, 295, 165, 30, 218, 22, 36, 79, 81, 100, 63, 242,
277, 61, 111, 167, 74, 235, 131, 243, 248, 114, 281, 62, 189,
139, 138, 209, 254, 291, 68, 92, 52, 78, 156, 247, 275, 77,
181, 229, 166, 16, 23, 31, 293, 219, 50, 155, 66, 260, 19,
258, 117, 193, 204, 212, 144, 234, 206, 228, 125, 29, 18, 84,
230, 54, 123, 101, 86, 13, 237, 85, 98, 67, 128, 95, 89,
99, 208, 134, 135, 268, 284, 119, 226, 105, 142, 207, 272, 263,
64, 40, 245, 163, 24, 265, 202, 259, 91, 106, 190, 162, 33,
194, 287, 292, 69, 239, 132, 255, 152, 121, 150, 143, 198, 103,
127, 285, 214, 151, 199, 56, 59, 215, 104, 238, 120, 21, 32,
270, 211, 200, 197, 11, 213, 93, 113, 178, 10, 290, 94, 231,
296, 47, 122, 271, 278, 276, 96, 240, 172, 257, 224, 173, 220,
185, 90, 51, 205, 70, 160, 137, 168, 87, 118, 288, 126, 241,
82, 227, 115, 164, 236, 286, 244, 108, 274, 201, 97, 25, 174,
182, 154, 48, 20, 53, 17, 261, 41, 266, 35, 140, 269, 146,
145, 65, 298, 133, 195, 55, 188, 75, 38, 43, 110, 37, 129,
170, 109, 267, 279, 112, 280, 76, 191, 26, 161, 179, 175, 252,
42, 124, 187, 148, 294, 44, 157, 192, 262, 159, 210, 250, 14,
273, 297, 225, 196]),
'Response': array([1, 0])}
data.describe()
| id | Age | Driving_License | Region_Code | Previously_Insured | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|
| count | 381109.000000 | 381109.000000 | 381109.000000 | 381109.000000 | 381109.000000 | 381109.000000 | 381109.000000 | 381109.000000 | 381109.000000 |
| mean | 190555.000000 | 38.822584 | 0.997869 | 26.388807 | 0.458210 | 30564.389581 | 112.034295 | 154.347397 | 0.122563 |
| std | 110016.836208 | 15.511611 | 0.046110 | 13.229888 | 0.498251 | 17213.155057 | 54.203995 | 83.671304 | 0.327936 |
| min | 1.000000 | 20.000000 | 0.000000 | 0.000000 | 0.000000 | 2630.000000 | 1.000000 | 10.000000 | 0.000000 |
| 25% | 95278.000000 | 25.000000 | 1.000000 | 15.000000 | 0.000000 | 24405.000000 | 29.000000 | 82.000000 | 0.000000 |
| 50% | 190555.000000 | 36.000000 | 1.000000 | 28.000000 | 0.000000 | 31669.000000 | 133.000000 | 154.000000 | 0.000000 |
| 75% | 285832.000000 | 49.000000 | 1.000000 | 35.000000 | 1.000000 | 39400.000000 | 152.000000 | 227.000000 | 0.000000 |
| max | 381109.000000 | 85.000000 | 1.000000 | 52.000000 | 1.000000 | 540165.000000 | 163.000000 | 299.000000 | 1.000000 |
# These three columns were parsed as float64 (see data.dtypes); store them
# as integers instead.
for float_col in ('Region_Code', 'Annual_Premium', 'Policy_Sales_Channel'):
    data[float_col] = data[float_col].astype(int)
data
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 44 | 1 | 28 | 0 | > 2 Years | Yes | 40454 | 26 | 217 | 1 |
| 1 | 2 | Male | 76 | 1 | 3 | 0 | 1-2 Year | No | 33536 | 26 | 183 | 0 |
| 2 | 3 | Male | 47 | 1 | 28 | 0 | > 2 Years | Yes | 38294 | 26 | 27 | 1 |
| 3 | 4 | Male | 21 | 1 | 11 | 1 | < 1 Year | No | 28619 | 152 | 203 | 0 |
| 4 | 5 | Female | 29 | 1 | 41 | 1 | < 1 Year | No | 27496 | 152 | 39 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 381104 | 381105 | Male | 74 | 1 | 26 | 1 | 1-2 Year | No | 30170 | 26 | 88 | 0 |
| 381105 | 381106 | Male | 30 | 1 | 37 | 1 | < 1 Year | No | 40016 | 152 | 131 | 0 |
| 381106 | 381107 | Male | 21 | 1 | 30 | 1 | < 1 Year | No | 35118 | 160 | 161 | 0 |
| 381107 | 381108 | Female | 68 | 1 | 14 | 0 | > 2 Years | Yes | 44617 | 124 | 74 | 0 |
| 381108 | 381109 | Male | 46 | 1 | 29 | 0 | 1-2 Year | No | 41777 | 26 | 237 | 0 |
381109 rows × 12 columns
data.rename(columns={'Vehicle_Age':'Vehicle_Age(years)'},inplace=True)
data
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age(years) | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 44 | 1 | 28 | 0 | > 2 Years | Yes | 40454 | 26 | 217 | 1 |
| 1 | 2 | Male | 76 | 1 | 3 | 0 | 1-2 Year | No | 33536 | 26 | 183 | 0 |
| 2 | 3 | Male | 47 | 1 | 28 | 0 | > 2 Years | Yes | 38294 | 26 | 27 | 1 |
| 3 | 4 | Male | 21 | 1 | 11 | 1 | < 1 Year | No | 28619 | 152 | 203 | 0 |
| 4 | 5 | Female | 29 | 1 | 41 | 1 | < 1 Year | No | 27496 | 152 | 39 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 381104 | 381105 | Male | 74 | 1 | 26 | 1 | 1-2 Year | No | 30170 | 26 | 88 | 0 |
| 381105 | 381106 | Male | 30 | 1 | 37 | 1 | < 1 Year | No | 40016 | 152 | 131 | 0 |
| 381106 | 381107 | Male | 21 | 1 | 30 | 1 | < 1 Year | No | 35118 | 160 | 161 | 0 |
| 381107 | 381108 | Female | 68 | 1 | 14 | 0 | > 2 Years | Yes | 44617 | 124 | 74 | 0 |
| 381108 | 381109 | Male | 46 | 1 | 29 | 0 | 1-2 Year | No | 41777 | 26 | 237 | 0 |
381109 rows × 12 columns
# Drop the trailing unit word so only the range remains ('> 2', '1-2', '< 1').
# str.rstrip() takes a *set of characters*, not a suffix, so rstrip('Years')
# left a trailing space on every value and the follow-up rstrip('year') did
# nothing. An anchored regex removes exactly the " Year"/" Years" suffix.
data['Vehicle_Age(years)'] = data['Vehicle_Age(years)'].str.replace(
    r'\s*Years?$', '', regex=True
)
data
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age(years) | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 44 | 1 | 28 | 0 | > 2 | Yes | 40454 | 26 | 217 | 1 |
| 1 | 2 | Male | 76 | 1 | 3 | 0 | 1-2 | No | 33536 | 26 | 183 | 0 |
| 2 | 3 | Male | 47 | 1 | 28 | 0 | > 2 | Yes | 38294 | 26 | 27 | 1 |
| 3 | 4 | Male | 21 | 1 | 11 | 1 | < 1 | No | 28619 | 152 | 203 | 0 |
| 4 | 5 | Female | 29 | 1 | 41 | 1 | < 1 | No | 27496 | 152 | 39 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 381104 | 381105 | Male | 74 | 1 | 26 | 1 | 1-2 | No | 30170 | 26 | 88 | 0 |
| 381105 | 381106 | Male | 30 | 1 | 37 | 1 | < 1 | No | 40016 | 152 | 131 | 0 |
| 381106 | 381107 | Male | 21 | 1 | 30 | 1 | < 1 | No | 35118 | 160 | 161 | 0 |
| 381107 | 381108 | Female | 68 | 1 | 14 | 0 | > 2 | Yes | 44617 | 124 | 74 | 0 |
| 381108 | 381109 | Male | 46 | 1 | 29 | 0 | 1-2 | No | 41777 | 26 | 237 | 0 |
381109 rows × 12 columns
data.duplicated().sum()
0
data.isnull()
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age(years) | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 381104 | False | False | False | False | False | False | False | False | False | False | False | False |
| 381105 | False | False | False | False | False | False | False | False | False | False | False | False |
| 381106 | False | False | False | False | False | False | False | False | False | False | False | False |
| 381107 | False | False | False | False | False | False | False | False | False | False | False | False |
| 381108 | False | False | False | False | False | False | False | False | False | False | False | False |
381109 rows × 12 columns
data.isnull().sum()
id 0 Gender 0 Age 0 Driving_License 0 Region_Code 0 Previously_Insured 0 Vehicle_Age(years) 0 Vehicle_Damage 0 Annual_Premium 0 Policy_Sales_Channel 0 Vintage 0 Response 0 dtype: int64
data
| id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age(years) | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 44 | 1 | 28 | 0 | > 2 | Yes | 40454 | 26 | 217 | 1 |
| 1 | 2 | Male | 76 | 1 | 3 | 0 | 1-2 | No | 33536 | 26 | 183 | 0 |
| 2 | 3 | Male | 47 | 1 | 28 | 0 | > 2 | Yes | 38294 | 26 | 27 | 1 |
| 3 | 4 | Male | 21 | 1 | 11 | 1 | < 1 | No | 28619 | 152 | 203 | 0 |
| 4 | 5 | Female | 29 | 1 | 41 | 1 | < 1 | No | 27496 | 152 | 39 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 381104 | 381105 | Male | 74 | 1 | 26 | 1 | 1-2 | No | 30170 | 26 | 88 | 0 |
| 381105 | 381106 | Male | 30 | 1 | 37 | 1 | < 1 | No | 40016 | 152 | 131 | 0 |
| 381106 | 381107 | Male | 21 | 1 | 30 | 1 | < 1 | No | 35118 | 160 | 161 | 0 |
| 381107 | 381108 | Female | 68 | 1 | 14 | 0 | > 2 | Yes | 44617 | 124 | 74 | 0 |
| 381108 | 381109 | Male | 46 | 1 | 29 | 0 | 1-2 | No | 41777 | 26 | 237 | 0 |
381109 rows × 12 columns
# Spread of Annual_Premium before any outlier treatment
premium_box_ax = sns.boxplot(data['Annual_Premium'])
premium_box_ax.set_xlabel('Finding Outliers in Annual_Premium')
plt.show()
# Function to cap outliers using IQR method
def cap_outliers(data, column, factor=1.5):
    """Clip the values of one column to the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``; values
    outside them are replaced by the nearest fence (capping), no rows are
    dropped.

    Args:
        data: DataFrame holding the column to cap. It is modified in place.
        column: Name of the numeric column to cap.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        The same DataFrame object that was passed in, with ``column`` capped.
    """
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    # Assigning back into the frame keeps the original in-place contract, so
    # callers that ignore the return value still see the capped column.
    data[column] = data[column].clip(lower=lower_bound, upper=upper_bound)
    return data
# Cap the extreme Annual_Premium values with the IQR rule ...
data = cap_outliers(data, 'Annual_Premium')
# ... and redraw the boxplot to see the effect of the capping
capped_box_ax = sns.boxplot(data['Annual_Premium'], color='#72a3fe')
capped_box_ax.set_xlabel('Finding Outliers in Annual_Premium')
plt.show()
# Histogram (30 bins) with a KDE overlay for customer age
age_hist_ax = sns.histplot(data['Age'], bins=30, kde=True, color='#DE3163')
age_hist_ax.set_title('Age Distribution')
age_hist_ax.set_xlabel('Age')
age_hist_ax.set_ylabel('Distribution')
plt.show()

# Histogram (30 bins) with a KDE overlay for the annual premium
premium_hist_ax = sns.histplot(
    data['Annual_Premium'], bins=30, kde=True, color='#48A6A7'
)
premium_hist_ax.set_title('Distribution of Annual Premium')
premium_hist_ax.set_xlabel('Annual_Premium')
premium_hist_ax.set_ylabel('Frequency')
plt.show()
# Filled KDE of Vintage. `shade` is the deprecated alias of `fill` in
# seaborn's kdeplot; use `fill=True` like the other KDE plots in this file.
sns.kdeplot(data['Vintage'], fill=True, color='#3a5a40')
plt.title('KDE Plot of Vintage')
plt.xlabel('Vintage')
plt.show()
# Density of the policy sales channel codes
channel_kde_ax = sns.kdeplot(data['Policy_Sales_Channel'], color='#800f2f')
channel_kde_ax.set_title('Distribution of Policy Sales Channel')
channel_kde_ax.set_xlabel('Policy Sales Channel')
channel_kde_ax.set_ylabel('Distribution')
plt.show()

# Mean age per gender (barplot aggregates with the mean; no error bars)
sns.barplot(
    x='Gender',
    y='Age',
    data=data,
    ci=None,
    palette=['#A0E7E5', '#B4F8C8'],
)
plt.show()
# Age frequency split by Previously_Insured.
# The old bins=range(0,51,5) stopped at 50 and silently dropped every
# customer older than that (ages run from 20 to 85 per data.describe()).
# binwidth=5 keeps the 5-year bins but spans the whole observed range.
sns.histplot(
    data=data,
    x='Age',
    binwidth=5,
    hue='Previously_Insured',
    element='poly',
    palette='deep',
)
plt.title('Frequency Distribution of Age with respect to Previously_Insured')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
# Policy sales channel frequency split by Vehicle_Damage.
# Channel codes run from 1 to 163 (see data.describe()), so the old
# bins=range(0,51,5) discarded every channel above 50, including the most
# common ones (124, 152, 160). binwidth=5 covers the full range.
sns.histplot(
    data=data,
    x='Policy_Sales_Channel',
    binwidth=5,
    hue='Vehicle_Damage',
    element='step',
    palette='pastel',
)
plt.title('Distribution of Policy Sales Channel')
plt.xlabel('Policy_Sales_Channel')
plt.ylabel('Frequency(Count)')
plt.show()
# Filled KDE of the sales channel, one curve per Previously_Insured value
insured_kde_ax = sns.kdeplot(
    x='Policy_Sales_Channel',
    hue='Previously_Insured',
    data=data,
    fill=True,
    palette=['#2ECC71', '#9B59B6'],
)
insured_kde_ax.set_title('KDE Plot of Policy Sales Channel')
insured_kde_ax.set_xlabel('Policy Sales Channel')
plt.show()
# Mean policy sales channel per vehicle-age bucket, split by Vehicle_Damage.
# The x axis is the *vehicle* age bucket, not the customer's Age column, so
# the title and x label now say so.
sns.lineplot(data=data, x='Vehicle_Age(years)', y='Policy_Sales_Channel', hue='Vehicle_Damage')
plt.title('Policy Sales Channel with respect to Vehicle Age')
plt.xlabel('Vehicle Age (years)')
plt.ylabel('Policy_Sales_Channel')
plt.show()
# Number of rows per vehicle-age bucket
vehicle_age_ax = sns.countplot(
    x=data['Vehicle_Age(years)'],
    palette=['#f0f7e0', '#d3bbdd', '#bc96ca'],
)
vehicle_age_ax.set_title('Vehicle Age Distribution')
vehicle_age_ax.set_xlabel('Vehicle Age')
vehicle_age_ax.set_ylabel('Count')
plt.show()

# Number of rows per gender, on a square 5x5 figure
plt.figure(figsize=(5, 5))
gender_count_ax = sns.countplot(x=data['Gender'], palette=['#deb3ad', '#eb7c8f'])
gender_count_ax.set_title('Gender Distribution')
gender_count_ax.set_xlabel('Gender')
gender_count_ax.set_ylabel('Count')
plt.show()

# Boxplots of the two 0/1 columns, one figure each
for binary_col in ('Response', 'Driving_License'):
    binary_box_ax = sns.boxplot(data[binary_col])
    binary_box_ax.set_xlabel('Finding Outliers in ' + binary_col)
    plt.show()
data["Response"].value_counts()
Response 0 334399 1 46710 Name: count, dtype: int64
# Class balance of the target column (Response)
response_count_ax = sns.countplot(
    x=data['Response'],
    palette=['#bbc0b6', '#d58469'],
)
response_count_ax.set_title('Distribution of Insurance Claims(Response)')
response_count_ax.set_xlabel('Response (0 = No Claim, 1 = Claim)')
response_count_ax.set_ylabel('Count')
plt.show()
# Vehicle Age vs Insurance Claims (Response)
# Bars are row counts per Response value, coloured by vehicle-age bucket.
# The previous title/labels ("Impact of Gender", "Gender", "Average Response
# Rate") belonged to a different plot and did not describe these axes.
sns.countplot(x='Response', hue='Vehicle_Age(years)', data=data, palette="pastel")
plt.title('Insurance Claims (Response) by Vehicle Age')
plt.xlabel('Response (0 = No Claim, 1 = Claim)')
plt.ylabel('Count')
plt.show()
# Pairwise relationships between all numeric columns, KDE on the diagonal
sns.pairplot(data=data, diag_kind='kde')
plt.show()

# Age histogram (30 bins, KDE overlay) split by Response
age_response_ax = sns.histplot(
    data,
    x='Age',
    hue='Response',
    bins=30,
    kde=True,
    palette='Reds',
)
age_response_ax.set_title('Age Distribution by Response')
age_response_ax.set_xlabel('Age')
age_response_ax.set_ylabel('Count')
plt.show()

# Annual premium against vintage, points coloured by Response
premium_vintage_ax = sns.scatterplot(
    x=data['Annual_Premium'],
    y=data['Vintage'],
    hue=data['Response'],
    palette='Set1',
)
premium_vintage_ax.set_title('Annual Premium vs Vintage (Colored by Response)')
premium_vintage_ax.set_xlabel('Annual Premium')
premium_vintage_ax.set_ylabel('Vintage')
plt.show()

# Filled KDE of the annual premium, one curve per Response value
premium_kde_ax = sns.kdeplot(
    x='Annual_Premium',
    hue='Response',
    data=data,
    fill=True,
    palette='BrBG',
)
premium_kde_ax.set_title('KDE Plot: Annual Premium by Response')
premium_kde_ax.set_xlabel('Annual_Premium')
premium_kde_ax.set_ylabel('Density')
plt.show()
# Boxplot of Vintage for each Response value.
# The y axis shows the Vintage column; it was mislabelled 'Age'.
sns.boxplot(x=data['Response'], y=data['Vintage'], palette=['#f0ede4','#435861'])
plt.title('Vintage Vs Response')
plt.xlabel('Insurance Claim (0 = No, 1 = Yes)')
plt.ylabel('Vintage')
plt.show()
# displot of vintage vs response
# Vintage runs from 10 to 299 (see data.describe()); the old
# bins=range(0,51,5) only kept values up to 50. binwidth=5 keeps the same
# bin size over the full observed range.
sns.displot(data=data, x='Vintage', binwidth=5, fill=True, hue='Response', kind='hist', element='poly', palette='Set1')
plt.show()
# displot of region code vs response
# Region codes go up to 52, so the 0-50 bin edges dropped codes 51 and 52.
sns.displot(data=data, x='Region_Code', binwidth=5, fill=True, hue='Response', kind='hist', element='poly', palette='Set2')
plt.show()
# Figure-level filled KDE of the sales channel, one curve per Response value
sns.displot(
    kind='kde',
    data=data,
    x='Policy_Sales_Channel',
    hue='Response',
    fill=True,
)
plt.show()

# Age distribution (30 bins, KDE overlay) on an 8x6 figure
plt.figure(figsize=(8, 6))
age_dist_ax = sns.histplot(data['Age'], bins=30, kde=True, color='#000c66')
age_dist_ax.set_title('Age Distribution')
age_dist_ax.set_xlabel('Age')
age_dist_ax.set_ylabel('Distribution')
plt.show()

# Age spread for each Response value
plt.figure(figsize=(8, 5))
age_box_ax = sns.boxplot(x=data['Response'], y=data['Age'], palette=['#fbe0e0'])
age_box_ax.set_title('Impact of Age on Insurance Claims')
age_box_ax.set_xlabel('Insurance Claim (0 = No, 1 = Yes)')
age_box_ax.set_ylabel('Age')
plt.show()

# Annual premium distribution (30 bins, KDE overlay)
premium_dist_ax = sns.histplot(
    data['Annual_Premium'], bins=30, kde=True, color='#5e4d50'
)
premium_dist_ax.set_title('Distribution of Annual Premium')
premium_dist_ax.set_xlabel('Annual_Premium')
premium_dist_ax.set_ylabel('Frequency')
plt.show()

# Annual premium spread for each Response value
plt.figure(figsize=(8, 5))
premium_box_by_response_ax = sns.boxplot(
    x=data['Response'],
    y=data['Annual_Premium'],
    palette=['#3d3019', '#e3a60a'],
)
premium_box_by_response_ax.set_title('Impact of Annual Premium on Insurance Claims')
premium_box_by_response_ax.set_xlabel('Insurance Claim(Response)')
premium_box_by_response_ax.set_ylabel('Annual Premium')
plt.show()

# Claim frequency: row count per Response value
plt.figure(figsize=(5, 4))
claim_count_ax = sns.countplot(data=data, x='Response', palette='Set3')
claim_count_ax.set_title('Claim Frequencies')
claim_count_ax.set_xlabel('Insurance Claim (Response)')
claim_count_ax.set_ylabel('Count')
plt.show()
# Pie chart of claims by gender.
# Response is a 0/1 flag, so summing it counts the rows with Response == 1
# per gender. The previous .count() counted *all* rows per gender, so the
# pie showed the gender split of the dataset rather than of the claims.
gender_claims = data.groupby('Gender')['Response'].sum()
plt.pie(gender_claims, labels=gender_claims.index, autopct='%1.1f%%', colors=['#004369', '#db1f48'], startangle=90)
plt.title('Proportion of Claims by Gender')
plt.show()
# Mean Response for each Vehicle_Damage value (no error bars)
plt.figure(figsize=(6, 4))
damage_bar_ax = sns.barplot(data=data, x='Vehicle_Damage', y='Response', ci=None)
damage_bar_ax.set_title('Claims by Vehicle Damage')
damage_bar_ax.set_xlabel('Vehicle Damage')
damage_bar_ax.set_ylabel('Mean Claim Response')
plt.show()

# Stacked bars: row counts per vehicle-age bucket, stacked by Response
vehicle_age_pair_sizes = data.groupby(['Vehicle_Age(years)', 'Response']).size()
data_grouped = vehicle_age_pair_sizes.unstack('Response')
colors = ['#a47551', '#e4d4c8']
vehicle_age_stack_ax = data_grouped.plot.bar(
    stacked=True, figsize=(6, 4), color=colors
)
vehicle_age_stack_ax.set_title('Claim Frequency by Vehicle Age')
vehicle_age_stack_ax.set_xlabel('Vehicle Age(years)')
vehicle_age_stack_ax.set_ylabel('Count')
plt.xticks(rotation=0)
vehicle_age_stack_ax.legend(title='Claim (1=Yes, 0=No)', fontsize=5)
plt.show()

# Point plot: mean Response for each Driving_License value
license_point_ax = sns.pointplot(
    data=data, x='Driving_License', y='Response', palette='coolwarm'
)
license_point_ax.set_title('Driving License and Claim Frequency')
license_point_ax.set_xlabel('Driving License (0 = No, 1 = Yes)')
license_point_ax.set_ylabel('Claim Frequency')
plt.show()
# Distribution of previously insured and insurance claim (response).
# Previously_Insured only takes the values 0 and 1, so the old
# bins=range(0,51,5) put both into the single [0, 5) bin and the plot could
# not tell them apart. discrete=True gives each value its own bar; the two
# Response groups are dodged side by side so they remain comparable.
sns.displot(
    data=data,
    x='Previously_Insured',
    discrete=True,
    multiple='dodge',
    shrink=0.8,
    hue='Response',
    kind='hist',
    palette='Set2',
)
plt.xticks([0, 1])
plt.show()
# Stacked bars: row counts per gender, stacked by Response
gender_pair_sizes = data.groupby(['Gender', 'Response']).size()
data_grouped = gender_pair_sizes.unstack('Response')
colors = ['#6f6fa6', '#f2e0d5']
gender_stack_ax = data_grouped.plot.bar(stacked=True, figsize=(6, 4), color=colors)
gender_stack_ax.set_title("Stacked Bar Plot of Claims by Gender")
gender_stack_ax.set_xlabel("Gender")
gender_stack_ax.set_ylabel("Count")
plt.xticks(rotation=0)
gender_stack_ax.legend(title="Claim (1=Yes, 0=No)", fontsize=6)
plt.show()

# Boxplot of Response for each vehicle-age bucket
vehicle_age_box_ax = sns.boxplot(data=data, x='Vehicle_Age(years)', y='Response')
vehicle_age_box_ax.set_title('Impact of Vehicle Age on Claim Likelihood')
plt.show()
# Proportion of claims by vehicle age.
# A pie chart shows parts of a whole, so each slice must be that bucket's
# number of claims (sum of the 0/1 Response flag). The previous .mean() fed
# per-bucket claim *rates* into the pie, which then normalised unrelated
# rates against each other instead of showing each bucket's share of claims.
vehicle_claims = data.groupby('Vehicle_Age(years)')['Response'].sum()
plt.pie(vehicle_claims, labels=vehicle_claims.index, autopct='%1.1f%%', colors=['#90adc6','#e9eaec','#fad02c'], startangle=90)
plt.title('Proportion of Claims by Vehicle Age')
plt.show()
# Region-wise analysis: mean Response per region code (no error bars)
plt.figure(figsize=(8, 5))
region_bar_ax = sns.barplot(data=data, x='Region_Code', y='Response', ci=None)
region_bar_ax.set_title('Claims by Region')
region_bar_ax.set_xlabel('Region Code')
region_bar_ax.set_ylabel('Claim Response')
plt.xticks(rotation=90)
plt.show()

# Policy-wise analysis: sales channel histogram (20 bins, KDE) by Response
plt.figure(figsize=(8, 5))
channel_hist_ax = sns.histplot(
    data,
    x='Policy_Sales_Channel',
    hue='Response',
    bins=20,
    kde=True,
    palette='OrRd',
)
channel_hist_ax.set_title('Histogram of Claims by Policy Sales Channel')
channel_hist_ax.set_xlabel('Policy Sales Channel')
channel_hist_ax.set_ylabel('Count')
plt.show()

# Stacked bars: row counts per Vehicle_Damage value, stacked by Response
damage_pair_sizes = data.groupby(['Vehicle_Damage', 'Response']).size()
data_grouped = damage_pair_sizes.unstack('Response')
colors = ['#51a7ad', '#f7a7bf']
damage_stack_ax = data_grouped.plot.bar(stacked=True, figsize=(6, 4), color=colors)
damage_stack_ax.set_title('Claim Frequency by Vehicle Damage')
damage_stack_ax.set_xlabel('Vehicle Damage')
damage_stack_ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.show()

# Filled KDE of the sales channel, one curve per Response value
plt.figure(figsize=(8, 5))
channel_response_kde_ax = sns.kdeplot(
    x='Policy_Sales_Channel',
    hue='Response',
    data=data,
    fill=True,
    palette='muted',
)
channel_response_kde_ax.set_title('KDE Plot of Claims by Policy Sales Channel')
channel_response_kde_ax.set_xlabel('Policy Sales Channel')
channel_response_kde_ax.set_ylabel('Density')
plt.show()
# Vintage distribution, one facet (column) per Response value.
# - Vintage runs from 10 to 299, so bins=range(0,51,5) hid almost all of the
#   data; binwidth=5 keeps the bin size over the full range.
# - displot returns a FacetGrid with one axes per facet. plt.ylabel() only
#   touches the current (last) axes, so the labels are set through the grid.
vintage_grid = sns.displot(data=data, x='Vintage', binwidth=5, kind='hist', col='Response', element='bars', hue='Response', palette='Set3')
vintage_grid.set_axis_labels('Vintage', 'Frequency(Count)')
plt.show()
# Stacked bars: row counts per Previously_Insured value, stacked by Response
insured_pair_sizes = data.groupby(['Previously_Insured', 'Response']).size()
data_grouped = insured_pair_sizes.unstack('Response')
colors = ['#ffc0d3', '#000000']
insured_stack_ax = data_grouped.plot.bar(stacked=True, figsize=(6, 4), color=colors)
insured_stack_ax.set_title('Previously Insured Vs Response')
insured_stack_ax.set_xlabel('Previously_Insured')
insured_stack_ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.show()